[1] "2.34.1"
modelling workflow for risk and safety
prepared for the Health & Safety Executive, Science Division
13 Feb 2025
BUGS/JAGS to Stan
Stan tunes hyperparameters (step size, mass matrix) during warmup!
Hoffman, M.D., Gelman, A., ‘The No-U-Turn Sampler’, JMLR, 2014
BUGS/JAGS to Stan
StanCon anyone?
‘democratising scalable UQ’ …once installed 🤔
looking for evidence of active corrosion growth
3×5 DataFrame
Row │ anomaly_id soil_type inspection measured_depth_mm T
│ Int64 String1 String3 Float64 Int64
─────┼─────────────────────────────────────────────────────────────
1 │ 1 A i_0 0.0672091 0
2 │ 2 A i_0 0.104202 0
3 │ 3 A i_0 0.100588 0
data {
  int<lower=0> n_anomalies;    // number of anomalies
  // NOTE(review): given vector length n_anomalies * n_inspections and the
  // R/Python callers passing 2, this appears to count growth INTERVALS per
  // anomaly (inspections - 1), not raw inspections — confirm naming.
  int<lower=0> n_inspections;
  vector[n_anomalies * n_inspections] cgr;  // growth rate observations (2 per anomaly)
}
parameters {
  real<lower=0> mu;     // mean growth rate
  real<lower=0> sigma;  // standard deviation of growth rates
}
model {
  // likelihood: iid normal growth rates
  for (i in 1 : (n_anomalies * n_inspections)) {
    cgr[i] ~ normal(mu, sigma);
  }
  /*
  //alternative (vectorised) implementation:
  cgr ~ normal(mu, sigma);
  //some suggested priors
  mu ~ normal(0.25, 3);  // NB: was 1/4, which is INTEGER division (== 0) in Stan
  sigma ~ exponential(1);
  */
}
generated quantities {
  /*
  vector[n_anomalies * n_inspections] log_lik;
  real cgr_pred = normal_rng(mu, sigma);
  for (i in 1:(n_anomalies * n_inspections)) {
    log_lik[i] = normal_lpdf(cgr[i] | mu, sigma);  // data vector is `cgr` (was `delta_C`, undeclared)
  }
  */
}
INFO:cmdstanpy:found newer exe file, not recompiling
data {
  int<lower=0> n_anomalies;    // number of anomalies
  // NOTE(review): given vector length n_anomalies * n_inspections and the
  // R/Python callers passing 2, this appears to count growth INTERVALS per
  // anomaly (inspections - 1), not raw inspections — confirm naming.
  int<lower=0> n_inspections;
  vector[n_anomalies * n_inspections] cgr;  // growth rate observations (2 per anomaly)
}
parameters {
  real<lower=0> mu;     // mean growth rate
  real<lower=0> sigma;  // standard deviation of growth rates
}
model {
  // likelihood: iid normal growth rates
  for (i in 1:(n_anomalies * n_inspections)) {
    cgr[i] ~ normal(mu, sigma);
  }
  /*
  //alternative (vectorised) implementation:
  cgr ~ normal(mu, sigma);
  //some suggested priors
  mu ~ normal(0.25, 3);  // NB: was 1/4, which is INTEGER division (== 0) in Stan
  sigma ~ exponential(1);
  */
}
generated quantities {
  /*
  vector[n_anomalies * n_inspections] log_lik;
  real cgr_pred = normal_rng(mu, sigma);
  for (i in 1:(n_anomalies * n_inspections)) {
    log_lik[i] = normal_lpdf(cgr[i] | mu, sigma);  // data vector is `cgr` (was `delta_C`, undeclared)
  }
  */
}
CmdStanR needs its input data as a list
# Convert repeated depth measurements into per-interval corrosion growth
# rates: within each anomaly (ordered by inspection time T), pair each
# measurement with the next one and return delta_C = d(depth) / d(time).
# Returns an ungrouped frame with anomaly_id, delta_C and soil_type.
prepare_data <- function(df = corrosion_data) {
  df |>
    arrange(anomaly_id, T) |>
    mutate(
      next_depth = lead(measured_depth_mm),  # depth at the following inspection
      time_diff = lead(T) - T,               # elapsed time to that inspection
      .by = anomaly_id                       # per-group lead; leaves result ungrouped
    ) |>
    filter(!is.na(next_depth)) |>            # drop each anomaly's final (pair-less) row
    mutate(delta_C = (next_depth - measured_depth_mm) / time_diff) |>
    select(anomaly_id, delta_C, soil_type)
}
# Assemble the CmdStanR input list matching the Stan data block.
# Run the preparation pipeline once and reuse the result (the original
# called prepare_data() twice, recomputing the whole pipeline).
cgr_data <- prepare_data()
model_data <- list(
  n_anomalies = cgr_data$anomaly_id |> unique() |> length(),
  # NOTE(review): passed as n_inspections but appears to be the number of
  # growth intervals per anomaly — confirm against the Stan data block.
  n_inspections = 2,
  cgr = cgr_data$delta_C
)
cgr_post <- cgr_model$sample(data = model_data)
Running MCMC with 4 sequential chains...
Chain 1 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 1 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 1 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 1 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 1 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 1 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 1 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 1 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 1 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 1 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1 finished in 0.0 seconds.
Chain 2 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 2 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 2 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 2 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 2 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 2 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 2 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 2 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 2 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 2 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 2 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2 finished in 0.0 seconds.
Chain 3 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 3 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 3 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 3 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 3 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 3 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 3 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 3 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 3 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 3 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 3 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3 finished in 0.0 seconds.
Chain 4 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 4 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 4 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 4 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 4 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 4 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 4 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 4 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 4 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 4 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 4 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4 finished in 0.0 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.0 seconds.
Total execution time: 0.6 seconds.
CmdStanPy needs its input data as a dictionary
def prepare_data(df = corrosion_data):
    """Compute per-interval corrosion growth rates.

    For each anomaly (sorted by inspection time T), pairs consecutive
    depth measurements and returns one row per interval with columns
    `anomaly_id` and `delta_C` = d(depth) / d(time).

    Uses window expressions (`shift(-1).over('anomaly_id')`) instead of
    the original group_by/agg/explode route: polars' `group_by` does not
    maintain row order by default, so the list-aggregation version could
    emit growth rates in a nondeterministic group order. Window
    expressions preserve the sorted row order and need no explode step.
    """
    return (
        df.sort(['anomaly_id', 'T'])
        .with_columns([
            # Next inspection's depth/time within the same anomaly;
            # null on each anomaly's final row.
            pl.col('measured_depth_mm').shift(-1).over('anomaly_id').alias('next_depth'),
            pl.col('T').shift(-1).over('anomaly_id').alias('next_time'),
        ])
        .filter(pl.col('next_depth').is_not_null())
        .with_columns([
            ((pl.col('next_depth') - pl.col('measured_depth_mm')) /
             (pl.col('next_time') - pl.col('T'))).alias('delta_C')
        ])
        .select(['anomaly_id', 'delta_C'])
        .filter(pl.col('delta_C').is_not_null())  # guard against null rates
    )
# Assemble the CmdStanPy input dict matching the Stan data block.
# Run the preparation pipeline once and reuse the result (the original
# called prepare_data() twice, recomputing the whole pipeline).
cgr_data = prepare_data()
model_data = {
    'n_anomalies': cgr_data.select('anomaly_id').unique().height,
    # NOTE(review): passed as n_inspections but appears to be the number of
    # growth intervals per anomaly — confirm against the Stan data block.
    'n_inspections': 2,
    'cgr': cgr_data.select('delta_C').to_series().to_numpy(),
}
cgr_post = cgr_model.sample(data = model_data)
INFO:cmdstanpy:CmdStan start processing
chain 1 |[33m [0m| 00:00 Status
chain 2 |[33m [0m| 00:00 Status[A
chain 3 |[33m [0m| 00:00 Status[A[A
chain 4 |[33m [0m| 00:00 Status[A[A[A
chain 1 |[34m##########[0m| 00:00 Sampling completed
chain 2 |[34m##########[0m| 00:00 Sampling completed
chain 3 |[34m##########[0m| 00:00 Sampling completed
chain 4 |[34m##########[0m| 00:00 Sampling completed
INFO:cmdstanpy:CmdStan done processing.
A Turing model needs its input data as arguments to the model function
# Turn consecutive inspection depths into per-interval corrosion growth
# rates: within each anomaly (ordered by inspection time T), one row per
# adjacent measurement pair, with Δc = Δdepth / Δtime.
# NOTE(review): unlike the R/Python versions, negative rates are clamped
# to zero via max(0, Δc) — confirm this difference is intentional.
function prepare_data(df::DataFrame = corrosion_data)
    out = DataFrame()
    for grp in groupby(sort(df, [:anomaly_id, :T]), :anomaly_id)
        # 1:(nrow-1) is empty for single-measurement anomalies, so they
        # contribute no rows.
        for row in 1:(nrow(grp) - 1)
            rate = (grp[row + 1, :measured_depth_mm] - grp[row, :measured_depth_mm]) /
                   (grp[row + 1, :T] - grp[row, :T])
            push!(out, (anomaly_id = grp[row, :anomaly_id], Δc = max(0, rate)))
        end
    end
    return out
end
# Fit the growth-rate model: build the model from the Δc vector, then
# draw 1_000 NUTS samples.
cgr_post = sample(corrosion_growth(prepare_data().Δc), NUTS(), 1_000)
# A draws_df: 1000 iterations, 4 chains, and 3 variables
lp__ mu sigma
1 130 0.082 0.036
2 130 0.081 0.037
3 130 0.083 0.043
4 126 0.071 0.052
5 129 0.076 0.045
6 127 0.066 0.046
7 130 0.079 0.041
8 130 0.085 0.037
9 130 0.078 0.039
10 130 0.086 0.043
# ... with 3990 more draws
# ... hidden reserved variables {'.chain', '.iteration', '.draw'}
# A tibble: 8,000 × 4
Parameter Chain Iteration value
<chr> <int> <int> <dbl>
1 mu 1 1 0.0822
2 mu 1 2 0.0812
3 mu 1 3 0.0827
4 mu 1 4 0.0712
5 mu 1 5 0.0759
6 mu 1 6 0.0663
7 mu 1 7 0.0792
8 mu 1 8 0.0850
9 mu 1 9 0.0776
10 mu 1 10 0.0863
# ℹ 7,990 more rows
array([[[ 1.30151e+02, 1.00000e+00, 6.85962e-01, ..., -1.29402e+02,
7.78052e-02, 4.24306e-02],
[ 1.29889e+02, 9.34442e-01, 7.36039e-01, ..., -1.28437e+02,
7.61124e-02, 3.85203e-02],
[ 1.30198e+02, 8.44682e-01, 8.67983e-01, ..., -1.28217e+02,
8.54163e-02, 3.83714e-02],
[ 1.30380e+02, 9.36487e-01, 7.91913e-01, ..., -1.26226e+02,
8.08443e-02, 4.23246e-02]],
[[ 1.29959e+02, 9.79152e-01, 6.85962e-01, ..., -1.29741e+02,
7.97090e-02, 4.46100e-02],
[ 1.30273e+02, 9.91006e-01, 7.36039e-01, ..., -1.29721e+02,
7.78616e-02, 3.98464e-02],
[ 1.30152e+02, 8.30931e-01, 8.67983e-01, ..., -1.28874e+02,
7.82429e-02, 4.28051e-02],
[ 1.29713e+02, 8.70971e-01, 7.91913e-01, ..., -1.29174e+02,
7.46756e-02, 3.98653e-02]],
[[ 1.29521e+02, 9.66414e-01, 6.85962e-01, ..., -1.29004e+02,
8.89830e-02, 3.82531e-02],
[ 1.30255e+02, 9.88834e-01, 7.36039e-01, ..., -1.30059e+02,
8.59262e-02, 3.97641e-02],
[ 1.30195e+02, 1.00000e+00, 8.67983e-01, ..., -1.30068e+02,
7.74592e-02, 4.14395e-02],
[ 1.29802e+02, 1.00000e+00, 7.91913e-01, ..., -1.29633e+02,
7.50011e-02, 4.03275e-02]],
...,
[[ 1.30308e+02, 9.63281e-01, 6.85962e-01, ..., -1.29387e+02,
7.81636e-02, 4.08331e-02],
[ 1.30327e+02, 9.16243e-01, 7.36039e-01, ..., -1.28799e+02,
7.83144e-02, 3.98632e-02],
[ 1.29505e+02, 5.03013e-01, 8.67983e-01, ..., -1.26257e+02,
7.38571e-02, 4.21486e-02],
[ 1.30385e+02, 9.85208e-01, 7.91913e-01, ..., -1.29876e+02,
7.90462e-02, 3.94353e-02]],
[[ 1.30290e+02, 9.30435e-01, 6.85962e-01, ..., -1.29247e+02,
8.47580e-02, 3.85130e-02],
[ 1.30514e+02, 1.00000e+00, 7.36039e-01, ..., -1.30347e+02,
8.12681e-02, 4.06528e-02],
[ 1.29510e+02, 9.81999e-01, 8.67983e-01, ..., -1.28867e+02,
7.51931e-02, 4.42580e-02],
[ 1.29409e+02, 8.95041e-01, 7.91913e-01, ..., -1.29353e+02,
8.97024e-02, 4.36003e-02]],
[[ 1.27707e+02, 6.22679e-01, 6.85962e-01, ..., -1.25941e+02,
9.37749e-02, 3.74611e-02],
[ 1.30394e+02, 9.70471e-01, 7.36039e-01, ..., -1.30205e+02,
8.43637e-02, 3.92485e-02],
[ 1.30155e+02, 9.46785e-01, 8.67983e-01, ..., -1.28688e+02,
8.30171e-02, 3.70230e-02],
[ 1.29778e+02, 9.53129e-01, 7.91913e-01, ..., -1.28328e+02,
8.55217e-02, 4.49969e-02]]])
lp__ accept_stat__ stepsize__ ... energy__ mu sigma
0 130.151 1.000000 0.685962 ... -129.402 0.077805 0.042431
1 129.959 0.979152 0.685962 ... -129.741 0.079709 0.044610
2 129.521 0.966414 0.685962 ... -129.004 0.088983 0.038253
3 129.653 1.000000 0.685962 ... -128.844 0.079044 0.045779
4 130.437 1.000000 0.685962 ... -129.651 0.080300 0.038906
... ... ... ... ... ... ... ...
3995 129.870 0.936764 0.791913 ... -128.660 0.077974 0.036835
3996 130.059 1.000000 0.791913 ... -129.848 0.079485 0.036906
3997 130.385 0.985208 0.791913 ... -129.876 0.079046 0.039435
3998 129.409 0.895041 0.791913 ... -129.353 0.089702 0.043600
3999 129.778 0.953129 0.791913 ... -128.328 0.085522 0.044997
[4000 rows x 9 columns]
1000×16 DataFrame
Row │ iteration chain μ σ lp n_steps is_accept a ⋯
│ Int64 Int64 Float64 Float64 Float64 Float64 Float64 F ⋯
──────┼─────────────────────────────────────────────────────────────────────────
1 │ 501 1 0.0776528 0.0409793 86.0414 3.0 1.0 ⋯
2 │ 502 1 0.0776528 0.0409793 86.0414 3.0 1.0
3 │ 503 1 0.078073 0.0486141 85.5279 3.0 1.0
4 │ 504 1 0.0722796 0.0470381 85.5204 3.0 1.0
5 │ 505 1 0.0722796 0.0470381 85.5204 3.0 1.0 ⋯
6 │ 506 1 0.0683701 0.0535426 84.514 3.0 1.0
7 │ 507 1 0.0779788 0.0502551 85.2467 3.0 1.0
8 │ 508 1 0.0777926 0.0382539 85.7037 3.0 1.0
⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱
994 │ 1494 1 0.0712522 0.0466391 85.4336 3.0 1.0 ⋯
995 │ 1495 1 0.0795906 0.0384015 85.838 3.0 1.0
996 │ 1496 1 0.0824833 0.0384926 85.8122 5.0 1.0
997 │ 1497 1 0.0824833 0.0384926 85.8122 1.0 1.0
998 │ 1498 1 0.0813499 0.0451814 85.868 3.0 1.0 ⋯
999 │ 1499 1 0.0813499 0.0451814 85.868 3.0 1.0
1000 │ 1500 1 0.0611499 0.050858 83.7136 3.0 1.0
9 columns and 985 rows omitted
success
next up: how we can extend this to a robust and helpful workflow
☕
Turing.jl
n_draws = 1_000; n_chains = 4
# Four interchangeable Turing.jl samplers for the corrosion growth model;
# any of them can be passed to run_mcmc below.
# a no U-turn sampler, with 2000 adaptive steps and a target acceptance rate of 0.65
NUTS_sampler = NUTS(2_000, 0.65)
# a Hamiltonian Monte Carlo sampler, with a step size of 0.05 and 10 leapfrog steps
HMC_sampler = HMC(0.05, 10)
# a Metropolis-Hastings sampler, using the default proposal distribution (priors)
MH_sampler = MH()
# a 'compositional' Gibbs sampler (Metropolis within Gibbs) - sampling μ with MH and σ with NUTS
# NOTE(review): trailing-symbol form (MH(:μ), NUTS(..., :σ)) is the older Gibbs
# API — confirm it matches the Turing.jl version in use.
Gibbs_sampler = Gibbs(MH(:μ), NUTS(2_000, 0.65, :σ))
# Fit the corrosion growth model with the supplied MCMC sampler, drawing
# n_draws iterations on each of n_chains threads (MCMCThreads).
run_mcmc = function(sampler)
    growth_rates = prepare_data().Δc
    return sample(corrosion_growth(growth_rates), sampler, MCMCThreads(), n_draws, n_chains)
end